setwd("/Users/alexandraciomek/Desktop/Work_BARI/BNS Work/2017/Final Analyses/Data")
library(lme4)
library(foreign)
options(scipen=999)

# load BNS data by year
bns06 <- read.csv("2006_BNS_for_Analysis_with_Weights.csv", header=T)
bns08 <- read.csv("2008_BNS_for_Analysis_with_Weights.csv", header=T)
bns0810 <- read.csv("2008-2010_BNS_Combined_with_Weights.csv", header=T)
bns10 <- read.csv("2010_BNS_for_Analysis_with_Weights.csv", header=T)

# load BG_IDs (year 2000)
cbgs <- read.dbf("census2000blockgroups_poly.dbf")
head(cbgs)
cbgs$BG_ID_00 <- cbgs$BG_ID
cbgs <- cbgs[,c("BG_ID_00","BG_ID")]

# load CT_IDs (year 2000)
cts <- read.dbf("census2000tracts_poly.dbf")
head(cts)
cts$CT_ID_00 <- cts$CT_ID
cts <- cts[,c("CT_ID_00","CT_ID")]

# change 98's to NA's
bns06[bns06==98] <- NA
bns08[bns08==98] <- NA
bns10[bns10==98] <- NA
bns0810[bns0810==98] <- NA

summary(bns06)
summary(bns08)
summary(bns10)
summary(bns0810)

# Create Individual-Level Files with NA's instead of 98's
write.csv(bns06, "/Users/alexandraciomek/Desktop/Work_BARI/BNS Work/2017/Files for Dataverse/Individual Years/2006_BNS_for_Analysis_with_Weights_032917.csv", row.names=F)
write.csv(bns08, "/Users/alexandraciomek/Desktop/Work_BARI/BNS Work/2017/Files for Dataverse/Individual Years/2008_BNS_for_Analysis_with_Weights_032917.csv", row.names=F)
write.csv(bns10, "/Users/alexandraciomek/Desktop/Work_BARI/BNS Work/2017/Files for Dataverse/Individual Years/2010_BNS_for_Analysis_with_Weights_032917.csv", row.names=F)
write.csv(bns0810, "/Users/alexandraciomek/Desktop/Work_BARI/BNS Work/2017/Files for Dataverse/Individual Years/2008-2010_Combined_BNS_for_Analysis_with_Weights_032917.csv", row.names=F)



#######################################
# 2008-2010 Combined Data for CBGs
#######################################

#create RRACE.new variable
table(bns0810$RRACE,bns0810$RHISP)
bns0810$RRACE.new<-rep(NA,nrow(bns0810))
bns0810$RRACE.new[bns0810$RRACE==4]<-5 #other
bns0810$RRACE.new[bns0810$RRACE==3]<-3 #asian
bns0810$RRACE.new[bns0810$RRACE==2]<-2 #white
bns0810$RRACE.new[bns0810$RRACE==1]<-1 #black
bns0810$RRACE.new[bns0810$RHISP==1]<-4 #hispanic
bns0810$RRACE.new[bns0810$RRACE==5 & bns0810$RHISP!=1]<-NA
table(bns0810$RRACE.new)

# calculate how many missing in ChildBin or RAGE
sum(is.na(bns0810$ChildBin) | is.na(bns0810$RAGE)) #45 out of 3428

#create BLACK, ASIAN, and HISPANIC variables
bns0810$BLACK<-rep(0,nrow(bns0810))
bns0810$BLACK[bns0810$RRACE.new==1]<-1
table(bns0810$BLACK)

bns0810$ASIAN<-rep(0,nrow(bns0810))
bns0810$ASIAN[bns0810$RRACE.new==3]<-1
table(bns0810$ASIAN)

bns0810$HISPANIC<-rep(0,nrow(bns0810))
bns0810$HISPANIC[bns0810$RRACE.new==4]<-1
table(bns0810$HISPANIC)
table(bns0810$RRACE.new)

#estimating Bayes residuals, model specification= ChildBin, RAGE, RGENDR, BLACK, ASIAN, HISPANIC 

# create data set for analysis: bns0810a - removing rows with missing values of BG_ID, RAGE, ChildBin, and RRACE.new
bns0810a <- bns0810[!is.na(bns0810$BG_ID) & !is.na(bns0810$RAGE) & !is.na(bns0810$ChildBin)  & !is.na(bns0810$RRACE.new),]

# creating a formula for the model and extracting the Bayes residuals
colnames(bns0810a)
length(unique(bns0810a$BG_ID))

# limit to pertinent CBGs
cbgs <- cbgs[cbgs$BG_ID_00 %in% unique(bns0810a$BG_ID),]


for (i in c(109:118,126,127)){
  data <- bns0810a[!is.na(bns0810a[,i]),]
  model <- lmer(data[,i] ~ data$ChildBin + data$RAGE + data$RGENDR + data$BLACK + data$ASIAN + data$HISPANIC +
              (1 | data$BG_ID))
  a<-coef(model)[[1]]
  a$BG_ID<-row.names(a)
  names(a)[1]<-paste(names(bns0810)[i], "0810", sep="_")
  cbgs<-merge(cbgs,a[c(1,8)],by="BG_ID",all.x=TRUE)
}

cbgs$BG_ID <- NULL
head(cbgs)
summary(cbgs)
dim(cbgs)

# calculate collective efficacy
cbgs$CollEff_0810 <- (cbgs$soccoh_0810 + cbgs$soccon_0810)/2

# create final file of measures
write.csv(cbgs, "/Users/alexandraciomek/Desktop/Work_BARI/BNS Work/2017/Final Analyses/Data to Publish/2008-10 Combined Measures by Block Group.csv", row.names=F)

#######################################
# 2006 Measures for CTs
#######################################

#create RRACE.new variable
table(bns06$RRACE,bns06$RHISP)
bns06$RRACE.new<-rep(NA,nrow(bns06))
bns06$RRACE.new[bns06$RRACE==4]<-5 #other
bns06$RRACE.new[bns06$RRACE==3]<-3 #asian
bns06$RRACE.new[bns06$RRACE==2]<-2 #white
bns06$RRACE.new[bns06$RRACE==1]<-1 #black
bns06$RRACE.new[bns06$RHISP==1]<-4 #hispanic
bns06$RRACE.new[bns06$RRACE==5 & bns06$RHISP!=1]<-NA
table(bns06$RRACE.new)

# calculate how many missing in ChildBin or RAGE
sum(is.na(bns06$ChildBin) | is.na(bns06$RAGE)) #25 out of 1707

#create BLACK, ASIAN, and HISPANIC variables
bns06$BLACK<-rep(0,nrow(bns06))
bns06$BLACK[bns06$RRACE.new==1]<-1
table(bns06$BLACK)

bns06$ASIAN<-rep(0,nrow(bns06))
bns06$ASIAN[bns06$RRACE.new==3]<-1
table(bns06$ASIAN)

bns06$HISPANIC<-rep(0,nrow(bns06))
bns06$HISPANIC[bns06$RRACE.new==4]<-1
table(bns06$HISPANIC)
table(bns06$RRACE.new)

#estimating Bayes residuals, model specification= ChildBin, RAGE, RGENDR, BLACK, ASIAN, HISPANIC 

# create data set for analysis: bns06a - removing rows with missing values of BG_ID, RAGE, ChildBin, and RRACE.new
bns06a <- bns06[!is.na(bns06$CT_ID) & !is.na(bns06$RAGE) & !is.na(bns06$ChildBin)  & !is.na(bns06$RRACE.new),]

# creating a formula for the model and extracting the Bayes residuals
colnames(bns06a)
length(unique(bns06a$CT_ID))
# columns of interest: 94-101, 104-106

# limit to pertinent CTs
ct06 <- cts[cts$CT_ID_00 %in% unique(bns06a$CT_ID),]


for (i in c(94:101,104:106)){
  data <- bns06a[!is.na(bns06a[,i]),]
  model <- lmer(data[,i] ~ data$ChildBin + data$RAGE + data$RGENDR + data$BLACK + data$ASIAN + data$HISPANIC +
                  (1 | data$CT_ID))
  a<-coef(model)[[1]]
  a$CT_ID<-row.names(a)
  names(a)[1]<-paste(names(bns06a)[i], "06", sep="_")
  ct06<-merge(ct06,a[c(1,8)],by="CT_ID",all.x=TRUE)
}

ct06$CT_ID <- NULL
head(ct06)
summary(ct06)
dim(ct06)

# calculate collective efficacy
ct06$CollEff_06 <- (ct06$soccoh_06 + ct06$soccon_06)/2

# create final file of measures
write.csv(ct06, "/Users/alexandraciomek/Desktop/Work_BARI/BNS Work/2017/Final Analyses/Data to Publish/2006 Measures by Tract.csv", row.names=F)



#######################################
# 2008 Measures for CTs
#######################################

#create RRACE.new variable
table(bns08$RRACE,bns08$RHISP)
bns08$RRACE.new<-rep(NA,nrow(bns08))
bns08$RRACE.new[bns08$RRACE==4]<-5 #other
bns08$RRACE.new[bns08$RRACE==3]<-3 #asian
bns08$RRACE.new[bns08$RRACE==2]<-2 #white
bns08$RRACE.new[bns08$RRACE==1]<-1 #black
bns08$RRACE.new[bns08$RHISP==1]<-4 #hispanic
bns08$RRACE.new[bns08$RRACE==5 & bns08$RHISP!=1]<-NA
table(bns08$RRACE.new)

#create BLACK, ASIAN, and HISPANIC variables
bns08$BLACK<-rep(0,nrow(bns08))
bns08$BLACK[bns08$RRACE.new==1]<-1
table(bns08$BLACK)

bns08$ASIAN<-rep(0,nrow(bns08))
bns08$ASIAN[bns08$RRACE.new==3]<-1
table(bns08$ASIAN)

bns08$HISPANIC<-rep(0,nrow(bns08))
bns08$HISPANIC[bns08$RRACE.new==4]<-1
table(bns08$HISPANIC)
table(bns08$RRACE.new)

#estimating Bayes residuals, model specification= ChildBin, RAGE, RGENDR, BLACK, ASIAN, HISPANIC 

# create data set for analysis: bns08a - removing rows with missing values of BG_ID, RAGE, ChildBin, and RRACE.new
bns08a <- bns08[!is.na(bns08$CT_ID) & !is.na(bns08$RAGE) & !is.na(bns08$ChildBin)  & !is.na(bns08$RRACE.new),]

# creating a formula for the model and extracting the Bayes residuals
colnames(bns08a)
length(unique(bns08a$CT_ID))
# columns of interest: c(107:117, 125:126)
head(bns08a[,c(108:117, 125:126)])
# limit to pertinent CTs
ct08 <- cts[cts$CT_ID_00 %in% unique(bns08a$CT_ID),]


for (i in c(108:117, 125:126)){
  data <- bns08a[!is.na(bns08a[,i]),]
  model <- lmer(data[,i] ~ data$ChildBin + data$RAGE + data$RGENDR + data$BLACK + data$ASIAN + data$HISPANIC +
                  (1 | data$CT_ID))
  a<-coef(model)[[1]]
  a$CT_ID<-row.names(a)
  names(a)[1]<-paste(names(bns08a)[i], "08", sep="_")
  ct08<-merge(ct08,a[c(1,8)],by="CT_ID",all.x=TRUE)
}

ct08$CT_ID <- NULL
head(ct08)
summary(ct08)
dim(ct08)

# calculate collective efficacy
ct08$CollEff_08 <- (ct08$soccoh_08 + ct08$soccon_08)/2

# create final file of measures
write.csv(ct08, "/Users/alexandraciomek/Desktop/Work_BARI/BNS Work/2017/Final Analyses/Data to Publish/2008 Measures by Tract.csv", row.names=F)





#######################################
# 2010 Measures for CTs
#######################################

#create RRACE.new variable
table(bns10$RRACE,bns10$RHISP)
bns10$RRACE.new<-rep(NA,nrow(bns10))
bns10$RRACE.new[bns10$RRACE==4]<-5 #other
bns10$RRACE.new[bns10$RRACE==3]<-3 #asian
bns10$RRACE.new[bns10$RRACE==2]<-2 #white
bns10$RRACE.new[bns10$RRACE==1]<-1 #black
bns10$RRACE.new[bns10$RHISP==1]<-4 #hispanic
bns10$RRACE.new[bns10$RRACE==5 & bns10$RHISP!=1]<-NA
table(bns10$RRACE.new)

# calculate how many missing in ChildBin or RAGE
sum(is.na(bns10$ChildBin) | is.na(bns10$RAGE)) #25 out of 1707

#create BLACK, ASIAN, and HISPANIC variables
bns10$BLACK<-rep(0,nrow(bns10))
bns10$BLACK[bns10$RRACE.new==1]<-1
table(bns10$BLACK)

bns10$ASIAN<-rep(0,nrow(bns10))
bns10$ASIAN[bns10$RRACE.new==3]<-1
table(bns10$ASIAN)

bns10$HISPANIC<-rep(0,nrow(bns10))
bns10$HISPANIC[bns10$RRACE.new==4]<-1
table(bns10$HISPANIC)
table(bns10$RRACE.new)

#estimating Bayes residuals, model specification= ChildBin, RAGE, RGENDR, BLACK, ASIAN, HISPANIC 

# create data set for analysis: bns10a - removing rows with missing values of BG_ID, RAGE, ChildBin, and RRACE.new
bns10a <- bns10[!is.na(bns10$CT_ID) & !is.na(bns10$RAGE) & !is.na(bns10$ChildBin)  & !is.na(bns10$RRACE.new),]

# creating a formula for the model and extracting the Bayes residuals
colnames(bns10a)
length(unique(bns10a$CT_ID))
# columns of interest: c(105:114, 121:123)

# limit to pertinent CTs
ct10 <- cts[cts$CT_ID_00 %in% unique(bns10a$CT_ID),]


for (i in c(105:114, 121:123)){
  data <- bns10a[!is.na(bns10a[,i]),]
  model <- lmer(data[,i] ~ data$ChildBin + data$RAGE + data$RGENDR + data$BLACK + data$ASIAN + data$HISPANIC +
                  (1 | data$CT_ID))
  a<-coef(model)[[1]]
  a$CT_ID<-row.names(a)
  names(a)[1]<-paste(names(bns10a)[i], "10", sep="_")
  ct10<-merge(ct10,a[c(1,8)],by="CT_ID",all.x=TRUE)
}

ct10$CT_ID <- NULL
head(ct10)
summary(ct10)
dim(ct10)

# calculate collective efficacy
ct10$CollEff_10 <- (ct10$soccoh_10 + ct10$soccon_10)/2

# create final file of measures
write.csv(ct10, "/Users/alexandraciomek/Desktop/Work_BARI/BNS Work/2017/Final Analyses/Data to Publish/2010 Measures by Tract.csv", row.names=F)

#######################################
# CREATE FULL LONGITUDINAL FILE
#######################################

# ct06, ct08, ct10
ctall1 <- merge(ct06,ct08, by="CT_ID_00", all.x=T, all.y=T)
ctall <- merge(ctall1, ct10, by="CT_ID_00", all.x=T, all.y=T)
colnames(ctall)
dim(ctall) # 156 tracts, 40 variables

write.csv(ctall, "/Users/alexandraciomek/Desktop/Work_BARI/BNS Work/2017/Final Analyses/Data to Publish/BNS Longitudinal Measures by Tract.csv", row.names=F)

ct0810 <- merge(ct08, ct10, by="CT_ID_00", all.x=T, all.y=T)
write.csv(ct0810, "/Users/alexandraciomek/Desktop/Work_BARI/BNS Work/2017/Final Analyses/Data to Publish/BNS 2008 and 2010 Measures by Tract.csv", row.names=F)






#######################################
# CREATE 2008 and 2010 individual files for translation to 2010 census geographies
#######################################

# need to calculate number of non-NA respondents for each measure by year and CT_ID

# use the bns data that was analyzed as it's most appropriate
tdata08 <- bns08a
tdata08 <- tdata08[,c(127, 108:117, 125:126)]

# create a number of respondent data set
resp08 <- ct08

# replace all values of measures with their respondent values
for (i in 1:nrow(ct08)){
  for (j in 2:ncol(tdata08)){
  resp08[i,j] <- sum(tdata08$CT_ID==resp08$CT_ID_00[i] & !is.na(tdata08[,j]))
  }}

summary(resp08)

# fix collective efficacy
resp08$CollEff_08 <- ifelse(resp08$soccoh_08==resp08$soccon_08, resp08$soccoh_08, NA)
summary(resp08)

# change column names
colnames(resp08)[2:ncol(resp08)] <- gsub("_08","_nresp_08",colnames(resp08)[2:ncol(resp08)])



# REPEAT FOR 2010
# use the bns data that was analyzed as it's most appropriate
tdata10 <- bns10a
colnames(tdata10)
tdata10 <- tdata10[,c(124, 105:114, 121:123)]

# create a number of respondent data set
resp10 <- ct10

# replace all values of measures with their respondent values
for (i in 1:nrow(ct10)){
  for (j in 2:ncol(tdata10)){
    resp10[i,j] <- sum(tdata10$CT_ID==resp10$CT_ID_00[i] & !is.na(tdata10[,j]))
  }}

summary(resp10)

# fix collective efficacy
resp10$CollEff_10 <- ifelse(resp10$soccoh_10==resp10$soccon_10, resp10$soccoh_10, NA)
summary(resp10) # check for NA's, fix if needed though it shouldn't be needed

# change column names
colnames(resp10)[2:ncol(resp10)] <- gsub("_10","_nresp_10",colnames(resp10)[2:ncol(resp10)])


# create a final data set for translation with 08, 10 measures and number of respondents
trans_08_f <- merge(ct08, resp08, by="CT_ID_00")
head(trans_08_f)
trans_10_f <- merge(ct10, resp10, by="CT_ID_00")
head(trans_10_f)

# create files for translation in STATA
write.csv(trans_08_f, "/Users/alexandraciomek/Desktop/Work_BARI/BNS Work/2017/Final Analyses/Updated Translation/2008_Measures_and_NResp_for_Translation.csv", row.names = F)
write.csv(trans_10_f, "/Users/alexandraciomek/Desktop/Work_BARI/BNS Work/2017/Final Analyses/Updated Translation/2010_Measures_and_NResp_for_Translation.csv", row.names = F)

# because of missing gangact data, had to do it separately for 2008
trans_08_f_gang <- trans_08_f[!is.na(trans_08_f$gangact_08), c("CT_ID_00","gangact_08","gangact_nresp_08")]
write.csv(trans_08_f_gang, "/Users/alexandraciomek/Desktop/Work_BARI/BNS Work/2017/Final Analyses/Updated Translation/2008_GANG_Measures_and_NResp_for_Translation.csv", row.names = F)


#######################################
# Create longitudinal file for translated data
#######################################

# after running the .do files in STATA, the following are the results

t08 <- read.csv("/Users/alexandraciomek/Desktop/Work_BARI/BNS Work/2017/Final Analyses/Updated Translation/BNS_Measures_Translated_2008_032917.csv", header = T)
t08g <- read.csv("/Users/alexandraciomek/Desktop/Work_BARI/BNS Work/2017/Final Analyses/Updated Translation/BNS_Measures_Translated_GANG_2008_032917.csv", header = T)
t10 <- read.csv("/Users/alexandraciomek/Desktop/Work_BARI/BNS Work/2017/Final Analyses/Updated Translation/BNS_Measures_Translated_2010_032917.csv", header = T)

# merge 2008 data with it's gangact variable from t08g and remove the "number of respondents" variables for both 2008 and 2010
colnames(t08)
t08.final <- merge(t08, t08g, by="trtid10", all.x=T, all.y=T)
colnames(t08.final)
colnames(ct0810)
t08.final <- t08.final[,c(1:3,26,4:13)]
colnames(t08.final)
colnames(t10)
t10 <- t10[,c(1:15)]

tfinal <- merge(t08.final, t10, by="trtid10", all.x=T, all.y=T)
summary(tfinal)

# check if the order of variables matches
dim(tfinal)
dim(ct0810)

# rename for consistency
colnames(tfinal)[1]<-"CT_ID_10"
colnames(tfinal)[2:ncol(tfinal)] <- colnames(ct0810)[2:ncol(ct0810)]
colnames(tfinal)

write.csv(tfinal, "/Users/alexandraciomek/Desktop/Work_BARI/BNS Work/2017/Final Analyses/Data to Publish/Translated BNS 2008 and 2010 Measures by 2010 Tract.csv", row.names=F)


#######################################
# CREATE A SHAPEFILE FOR THE 2010 TRANSLATED DATA
#######################################
bnst10 <- tfinal[,c(1,15:28)]
#bnst10 <- read.csv("/Users/alexandraciomek/Desktop/Work_BARI/BNS Work/2017/Final Analyses/Data to Publish/Translated BNS 2008 and 2010 Measures by 2010 Tract.csv", header=T)


library(rgdal)

setwd("~/Desktop/Work_BARI/Geographical Infrastructure/Tracts")
bos.tr <- readOGR(".", "Tracts_Boston_2010_BARI")

bns.shape <- merge(bos.tr, bnst10, by = "CT_ID_10", all.x = T, all.y = T)
summary(bns.shape)

setwd("/Users/alexandraciomek/Desktop/Work_BARI/BNS Work/2017/Final Analyses/Updated Translation/Shape")
writeOGR(bns.shape, ".", "2010_BNS_for_BostonMap_CT_ID_10", driver = "ESRI Shapefile")




